# List all objects in the environment
# NOTE(review): clearing the global environment at the top of a script is
# discouraged — it does not give a fresh R session (packages/options persist)
# and silently deletes the user's workspace. Prefer restarting R instead.
objects_to_remove <- ls()
# Remove all objects
rm(list = objects_to_remove)
#install.packages(c("forecast", "expsmooth", "seasonal"))
# --- Package loading ---------------------------------------------------------
# The "##" lines below are console messages captured from a previous knitted
# run, kept as a record of package versions and masking/attach order.
library(TTR)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(tseries)
library(expsmooth)
library(fpp2)
## ── Attaching packages ────────────────────────────────────────────── fpp2 2.5 ──
## ✔ ggplot2 3.4.2 ✔ fma 2.5
##
library(seasonal)
library(MASS)
##
## Attaching package: 'MASS'
## The following objects are masked from 'package:fma':
##
## cement, housing, petrol
library(stats)
# NOTE: TSA masks stats::acf and stats::arima below; the custom eacf()
# function later in this file relies on TSA's acf (drop.lag.0 argument).
library(TSA)
## Registered S3 methods overwritten by 'TSA':
## method from
## fitted.Arima forecast
## plot.Arima forecast
##
## Attaching package: 'TSA'
## The following objects are masked from 'package:stats':
##
## acf, arima
## The following object is masked from 'package:utils':
##
## tar
# NOTE(review): 'forecast' was already attached above; this call is a no-op.
library(forecast)
library(ggplot2)
# NOTE(review): 'tseries' was already attached above; this call is a no-op.
library(tseries)
library(imputeTS)
##
## Attaching package: 'imputeTS'
## The following object is masked from 'package:tseries':
##
## na.remove
library(vars)
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following object is masked from 'package:imputeTS':
##
## na.locf
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: urca
## Loading required package: lmtest
library(timetk)
library(Metrics)
##
## Attaching package: 'Metrics'
## The following object is masked from 'package:forecast':
##
## accuracy
# NOTE(review): 'lmtest' was already attached as a dependency of 'vars'
# (see messages above); this call is a no-op.
library(lmtest)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(xts)
library(dplyr)
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:xts':
##
## first, last
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
## dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 0x0006): Library not loaded: /opt/X11/lib/libSM.6.dylib
## Referenced from: <FF564E7B-F7DD-3BAE-972C-DE65F8735FC9> /Library/Frameworks/R.framework/Versions/4.2/Resources/modules/R_X11.so
## Reason: tried: '/opt/X11/lib/libSM.6.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/X11/lib/libSM.6.dylib' (no such file), '/opt/X11/lib/libSM.6.dylib' (no such file), '/Library/Frameworks/R.framework/Resources/lib/libSM.6.dylib' (no such file), '/Library/Java/JavaVirtualMachines/jdk1.8.0_241.jdk/Contents/Home/jre/lib/server/libSM.6.dylib' (no such file)
## Warning in system2("/usr/bin/otool", c("-L", shQuote(DSO)),
## stdout = TRUE): running command ''/usr/bin/otool' -L
## '/Library/Frameworks/R.framework/Resources/library/tcltk/libs//tcltk.so'' had
## status 1
## Could not load tcltk. Will use slower R code instead.
## Loading required package: RSQLite
# --- Data import -------------------------------------------------------------
# NOTE(review): absolute, machine-specific path; consider a relative path or a
# project-root helper so the script runs on other machines.
path <- "/Users/nikithagopal/Desktop/MSCA Data/Time Series Final/"
# Blank cells and the literal string "NA" are both read as missing values.
pm2_5_data <- read.csv(paste0(path, "la_pm2_5_pollutant_data.csv"),
na.strings=c("", "NA"))
ozone_data <- read.csv(paste0(path, "la_ozone_pollutant_data.csv"),
na.strings=c("", "NA"))
#path <- "/Users/sydneypeters/Desktop/"
#pm2_5_data <- read.csv(paste0(path, "pm2.5_LA.csv"),
# na.strings=c("", "NA"))
#ozone_data <- read.csv(paste0(path, "ozone_LA.csv"),
# na.strings=c("", "NA"))
# Convert data into ts objects
# Daily series; frequency 365.25 treats one calendar year (averaging leap
# years) as the seasonal period.
# NOTE(review): ts() only uses the first two elements of `start`; the third
# element in c(1999,1,1) is silently ignored.
pm2_5_ts <- ts(pm2_5_data$PM2.5.AQI.Value, start = c(1999,1,1), frequency = 365.25)
ozone_ts <- ts(ozone_data$Ozone.AQI.Value, start = c(1999,1,1), frequency = 365.25)
plot(pm2_5_ts, main="Los Angeles PM 2.5 AQI")
plot(ozone_ts, main="Los Angeles Ozone AQI")
# --- Stationarity checks on the daily series ---------------------------------
# KPSS: null hypothesis is level stationarity.
kpss.test(pm2_5_ts)
## Warning in kpss.test(pm2_5_ts): p-value smaller than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: pm2_5_ts
## KPSS Level = 6.951, Truncation lag parameter = 12, p-value = 0.01
# The reported p-value is 0.01, which is smaller than 0.05, and would suggest
# that we reject the null hypothesis of level stationarity and conclude that
# there is evidence that the data is non-stationary
# ADF: null hypothesis is a unit root (non-stationarity) — the opposite null
# of KPSS, so the two tests can legitimately disagree.
adf.test(pm2_5_ts)
## Warning in adf.test(pm2_5_ts): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: pm2_5_ts
## Dickey-Fuller = -14.554, Lag order = 20, p-value = 0.01
## alternative hypothesis: stationary
# Similarly the Augmented Dickey-Fuller test results in p-value of 0.01 (<0.05)
# where we do reject the null hypothesis that the time series is non-stationary
# and thus data is stationary.
# These are conflicting results so we use ACF PACF plots to see if the series is stationary
acf(pm2_5_ts, main = "ACF of Original Time Series")
pacf(pm2_5_ts, main = "PACF of Original Time Series")
#### Step 3: Prepare data for Weekly & Monthly extraction and EDA
# Create data with Dates
pm2_5_df <- as.data.frame(pm2_5_ts)
# Parse the month/day/year date strings from the raw CSV with lubridate::mdy.
# NOTE(review): assumes pm2_5_data rows align 1:1 with pm2_5_ts — confirm.
pm2_5_df$Date <- mdy(pm2_5_data$Date)
colnames(pm2_5_df) <- c("PM2.5.AQI.Value", "Date")
# Convert to xts format for weekly & monthly
pm2_5_xts <- as.xts(pm2_5_df, order.by=pm2_5_df$Date)
# Drop the Date column; the xts index now carries the dates.
pm2_5_xts <- pm2_5_xts[,-2]
# Let's see how seasonality looks over time and is the variance changing
pm2_5_df$Month <- month(pm2_5_df$Date)
pm2_5_df$Year <- year(pm2_5_df$Date)
# Average PM2.5 AQI for every Year x Month combination.
# FIX: the previous version grouped on pm2_5_df$Year / pm2_5_df$Month —
# using `$` inside dplyr verbs bypasses the data mask and produced columns
# literally named "pm2_5_df$Year" that then had to be renamed. Bare column
# names are the idiomatic form; .groups = "drop" returns an ungrouped result
# (and silences the "grouped output" message seen in the old run).
avg_aqi_by_month_year <- pm2_5_df %>%
  group_by(Year, Month) %>%
  summarise(
    avg_value = mean(PM2.5.AQI.Value),
    .groups = "drop"
  )
# Kept for compatibility with downstream code (now a no-op rename).
colnames(avg_aqi_by_month_year) <- c("Year", "Month", "avg_value")
# Pivot the monthly averages to a Month x Year table (one column per year) so
# each year's seasonal profile can be compared side by side in one boxplot.
# FIX: replaces the previous 25-branch hard-coded SQL CASE pivot (sqldf) with
# tidyr::pivot_wider, which generalizes automatically to whatever years are
# present in the data. names_sort = TRUE orders the year columns ascending,
# matching the old Year_1999..Year_2023 layout; column names come out as
# "1999".."2023" directly, so the previous rename step is no longer needed.
reshape_avg_aqi_by_month_year <- as.data.frame(avg_aqi_by_month_year) %>%
  pivot_wider(
    names_from = Year,
    values_from = avg_value,
    names_sort = TRUE
  ) %>%
  arrange(Month) %>%
  as.data.frame()
# One box per year (all columns except Month).
boxplot(reshape_avg_aqi_by_month_year[-1])
# We can see that over the years there is downward trend and variance is decreasing
# except 2020 when we see a high peak likely caused by wildfires
# Link: https://en.wikipedia.org/wiki/Lake_Fire_(2020)
# --- Weekly aggregation ------------------------------------------------------
# Average the daily xts series into calendar-week means.
pm2_5_weekly <- apply.weekly(pm2_5_xts, mean)
# Subset to the 1999-01-03 .. 2023-08-11 window; frequency 52.18 (weeks per
# average year) keeps the ts time base aligned with calendar years.
pm2_5_weekly_ts <- ts(pm2_5_weekly["19990103/20230811"],
start = c(1999,1),
frequency = 52.18)
plot(pm2_5_weekly_ts)
# Strong seasonality, Strong cyclic, light downward trend, variance is reducing
kpss.test(pm2_5_weekly_ts)
## Warning in kpss.test(pm2_5_weekly_ts): p-value smaller than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: pm2_5_weekly_ts
## KPSS Level = 3.4904, Truncation lag parameter = 7, p-value = 0.01
adf.test(pm2_5_weekly_ts)
## Warning in adf.test(pm2_5_weekly_ts): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: pm2_5_weekly_ts
## Dickey-Fuller = -8.4636, Lag order = 10, p-value = 0.01
## alternative hypothesis: stationary
acf(pm2_5_weekly_ts, main = "ACF of Weekly Time Series")
pacf(pm2_5_weekly_ts, main = "PACF of Weekly Time Series")
# Split the data into a training and test dataset
# Train: 1999 through 2020; test: 2021 onward.
train_weekly <- window(pm2_5_weekly_ts, start = c(1999,1), end=c(2020,52))
test_weekly <- window(pm2_5_weekly_ts, start = c(2021,1))
# Looking at spectral analysis
periodogram(train_weekly, log = "no", plot=TRUE,
ylab = "Periodogram",
xlab = "Frequency",
lwd=2, xlim = c(0, 0.06))
# There are three dominant periodicities:
# 1) A typical yearly (52 weeks) seasonal trend
# 2) A trend that is repeating every 9.6 years (500 weeks)
# 3) A typical half yearly (26 weeks) seasonal trend
# Overall, there is no mixed effect that normal seasonality model cannot capture.
# Decompose the series
# Multiplicative decomposition: the seasonal amplitude appears to scale with
# the level of the series (variance shrinks as AQI declines).
plot(decompose(train_weekly, type="multiplicative"))
# Important Inferences
# 1) The PM2.5 AQI has been decreasing overall though there are rise every now and then.
# However the trend is going down with time.
# 2) Winter months have a strong seasonal effect with Nov and Dec being the peak months
# usually which likely could be due to cold temperatures causing pollutant to
# not escape the lower atmosphere.
# Link: https://www.accuweather.com/en/health-wellness/why-air-pollution-is-worse-in-winter/689434#:~:text=Cold%20air%20is%20denser%20and%20moves%20slower%20than%20warm%20air,rate%20than%20during%20the%20summer.
# 3) We can see a seasonal cycle of 12 months where the mean value of each month starts
# with a increasing trend in the beginning of the year and drops down towards
# the end of the year. We can see a seasonal effect with a cycle of 12 months.
# Understand the seasonality and remove it to see the trend
# NOTE(review): the series frequency is 52.18 but diff() needs an integer lag,
# so 52 observations (~1 year) is used for the seasonal difference.
train_weekly_diff <- diff(train_weekly, lag = 52)
autoplot(train_weekly_diff, ylab="Train Seasonal Differencing - Weekly Data")
# Let's look if the series is stationary?
kpss.test(train_weekly_diff)
## Warning in kpss.test(train_weekly_diff): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: train_weekly_diff
## KPSS Level = 0.17702, Truncation lag parameter = 7, p-value = 0.1
# The reported p-value is 0.1, which is > 0.05, and would suggest that we fail
# to reject the null hypothesis of level stationarity (conclusion: stationary)
adf.test(train_weekly_diff)
## Warning in adf.test(train_weekly_diff): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: train_weekly_diff
## Dickey-Fuller = -8.5534, Lag order = 10, p-value = 0.01
## alternative hypothesis: stationary
# The reported p-value of 0.01 (<0.05) so we do reject the null hypothesis that
# the time series is non-stationary (conclusion: stationary)
acf(train_weekly_diff, main = "ACF of Seasonal Differencing Time Series - Weekly Data")
pacf(train_weekly_diff, main = "PACF of Seasonal Differencing Time Series - Weekly Data")
# Forecast the seasonal naïve for (2021-01 to 2023-07)
# NOTE(review): h=110 is fewer steps than the ~137-week test window printed in
# the comparison table later; ts arithmetic below silently restricts to the
# overlapping window — confirm this is intended.
forecast_snaive_weekly <- snaive(train_weekly, h=110)
## Warning in lag.default(y, -lag): 'k' is not an integer
# (The warning stems from the non-integer frequency 52.18; the seasonal lag
# is rounded internally.)
# Plot the forecasts for snaive model
plot(forecast_snaive_weekly, main = "PM 2.5 AQI Forecast - SNaive (Weekly)",
xlab = "Week", ylab = "PM 2.5 AQI")
lines(test_weekly)
# Compare the forecast with the actual test data by calculating MAPE and MSE
# Mean Absolute Percentage Error (MAPE)
mape_snaive_weekly <- mean(abs((test_weekly - forecast_snaive_weekly$mean)/test_weekly))
mape_snaive_weekly
## [1] 0.2749545
# Mean Absolute Error (MAE)
mae_snaive_weekly <- mean(abs((test_weekly - forecast_snaive_weekly$mean)))
mae_snaive_weekly
## [1] 16.28571
# Mean Squared Error (MSE)
mse_snaive_weekly <- mean((test_weekly - forecast_snaive_weekly$mean)^2)
mse_snaive_weekly
## [1] 599.6278
# Evaluate the residuals
checkresiduals(forecast_snaive_weekly)
##
## Ljung-Box test
##
## data: Residuals from Seasonal naive method
## Q* = 509.98, df = 104.36, p-value < 2.2e-16
##
## Model df: 0. Total lags used: 104.36
# Create monthly data
# Average the daily xts series into calendar-month means, then rebuild as a
# ts with the natural monthly frequency of 12.
pm2_5_monthly <- apply.monthly(pm2_5_xts, mean)
pm2_5_monthly_ts <- ts(pm2_5_monthly["19990131/20230811"], start = c(1999,1), frequency = 12)
plot(pm2_5_monthly_ts)
kpss.test(pm2_5_monthly_ts)
## Warning in kpss.test(pm2_5_monthly_ts): p-value smaller than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: pm2_5_monthly_ts
## KPSS Level = 1.9774, Truncation lag parameter = 5, p-value = 0.01
adf.test(pm2_5_monthly_ts)
## Warning in adf.test(pm2_5_monthly_ts): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: pm2_5_monthly_ts
## Dickey-Fuller = -7.4466, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
acf(pm2_5_monthly_ts, main = "ACF of Monthly Time Series")
pacf(pm2_5_monthly_ts, main = "PACF of Monthly Time Series")
# Split the data into a training and test dataset
# Train: 1999 through 2020; test: 2021 onward.
train_monthly <- window(pm2_5_monthly_ts, start = c(1999,1), end=c(2020,12))
test_monthly <- window(pm2_5_monthly_ts, start = c(2021,1))
# Looking at spectral analysis
periodogram(train_monthly, log = "no", plot=TRUE,
ylab = "Periodogram",
xlab = "Frequency",
lwd=2, xlim = c(0, 0.2))
# There are three dominant periodicities:
# 1) A typical yearly (12 months) seasonal trend
# 2) A trend that is repeating every 8-9 years (100 months)
# 3) A typical half yearly (6 months) seasonal trend
# Overall, there is no mixed effect that normal seasonality model cannot capture.
# Decompose the series
plot(decompose(train_monthly, type="multiplicative"))
# Important Inferences
# 1) The PM2.5 AQI has been decreasing overall though there are rise every now and then.
# However the trend is going down with time.
# 2) Winter months have a strong seasonal effect with Nov and Dec being the peak months
# usually which likely could be due to cold temperatures causing pollutant to
# not escape the lower atmosphere.
# Link: https://www.accuweather.com/en/health-wellness/why-air-pollution-is-worse-in-winter/689434#:~:text=Cold%20air%20is%20denser%20and%20moves%20slower%20than%20warm%20air,rate%20than%20during%20the%20summer.
# 3) We can see a seasonal cycle of 12 months where the mean value of each month starts
# with a increasing trend in the beginning of the year and drops down towards
# the end of the year. We can see a seasonal effect with a cycle of 12 months.
# Understand the seasonality and remove it to see the trend
# Seasonal difference at the monthly period (12).
train_monthly_diff <- diff(train_monthly, lag = 12)
autoplot(train_monthly_diff, ylab="Train Seasonal Differencing (Monthly)")
# Let's look if the series is stationary?
kpss.test(train_monthly_diff)
## Warning in kpss.test(train_monthly_diff): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: train_monthly_diff
## KPSS Level = 0.14573, Truncation lag parameter = 5, p-value = 0.1
# The reported p-value is 0.1, which is > 0.05, and would suggest that we fail
# to reject the null hypothesis of level stationarity (conclusion: stationary)
adf.test(train_monthly_diff)
## Warning in adf.test(train_monthly_diff): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: train_monthly_diff
## Dickey-Fuller = -5.1056, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
# The reported p-value of 0.01 (<0.05) so we do reject the null hypothesis that
# the time series is non-stationary (conclusion: stationary)
acf(train_monthly_diff, main = "ACF of Seasonal Differencing Time Series")
pacf(train_monthly_diff, main = "PACF of Seasonal Differencing Time Series")
# Forecast the seasonal naïve for (2021-01 to 2023-07)
# NOTE(review): h=31 vs 32 rows in the later monthly comparison table; ts
# arithmetic below restricts to the overlapping window — confirm intended.
forecast_snaive_monthly <- snaive(train_monthly, h=31)
# Plot the forecasts for snaive model
# FIX: x-axis label was "Week", copy-pasted from the weekly section — this
# plot is on the monthly series.
plot(forecast_snaive_monthly, main = "PM 2.5 AQI Forecast - SNaive",
xlab = "Month", ylab = "PM 2.5 AQI")
lines(test_monthly)
# Compare the forecast with the actual test data by calculating MAPE and MSE
# Mean Absolute Percentage Error (MAPE)
mape_snaive <- mean(abs((test_monthly - forecast_snaive_monthly$mean)/test_monthly))
mape_snaive
## [1] 0.20778
# Mean Squared Error (MSE)
mse_snaive <- mean((test_monthly - forecast_snaive_monthly$mean)^2)
mse_snaive
## [1] 282.8901
# Evaluate the residuals
checkresiduals(forecast_snaive_monthly)
##
## Ljung-Box test
##
## data: Residuals from Seasonal naive method
## Q* = 82.387, df = 24, p-value = 2.523e-08
##
## Model df: 0. Total lags used: 24
# --- Weekly TBATS model ------------------------------------------------------
# STL view of the weekly training series (frequency 52.18 matches the data).
vec <- as.vector(train_weekly)
univariate_ts <- ts(vec, start=1999, frequency=52.18)
stl_decomposition <- stl(univariate_ts, s.window="periodic")
plot(stl_decomposition)
# Forecast horizon = full length of the held-out test set.
h <- length(test_weekly)
# TBATS handles Box-Cox transformation, trend damping and trigonometric
# seasonality automatically — suitable for the non-integer weekly period.
model_tbats <- tbats(train_weekly)
forecast_tbats <- forecast(model_tbats, h=h)
plot(forecast_tbats)
checkresiduals(model_tbats)
##
## Ljung-Box test
##
## data: Residuals from TBATS
## Q* = 128.92, df = 104.36, p-value = 0.05179
##
## Model df: 0. Total lags used: 104.36
#Ljung-Box test .05179
# Full weekly series with the test set (red) and TBATS forecast (blue) overlaid.
plot(pm2_5_weekly_ts, xlim=c(1999, 2023), ylim=c(min(pm2_5_weekly_ts), max(pm2_5_weekly_ts)), main="Train, Test, and Forecasted Data - Weekly", xlab="Year", ylab="PM2.5 AQI Value")
lines(test_weekly, col="red") # Test data in red
lines(forecast_tbats$mean, col="blue", type="o") # Forecasted data in blue
legend("topleft", legend=c("Train", "Test", "Forecast"), fill=c("black", "red", "blue"))
# Out-of-sample accuracy of the weekly TBATS forecast.
mae <- mean(abs(test_weekly - forecast_tbats$mean))
# FIX: cat() already prints its message and returns NULL invisibly; the old
# print(cat(...)) wrapper additionally printed a spurious "NULL" line
# (visible in the previously recorded output).
cat("Mean Absolute Error (MAE):", round(mae, 2), "\n")
## Mean Absolute Error (MAE): 9.25
#Mean Absolute Error (MAE): 9.31
# Side-by-side table of actual vs TBATS-forecast weekly values over the
# test period (Date is the fractional-year ts time index).
comparison_df <- data.frame(
Date = time(test_weekly),
Actual = as.vector(test_weekly),
Forecasted = as.vector(forecast_tbats$mean)
)
print(comparison_df)
## Date Actual Forecasted
## 1 2021.001 91.57143 79.76285
## 2 2021.020 83.28571 81.90778
## 3 2021.039 77.28571 80.15252
## 4 2021.058 52.57143 75.73405
## 5 2021.077 46.42857 69.93137
## 6 2021.097 65.57143 64.53847
## 7 2021.116 64.57143 60.93566
## 8 2021.135 54.57143 59.52249
## 9 2021.154 58.42857 59.74437
## 10 2021.173 52.14286 60.36729
## 11 2021.192 43.85714 60.06514
## 12 2021.212 45.14286 58.25418
## 13 2021.231 50.85714 55.52292
## 14 2021.250 60.42857 53.17368
## 15 2021.269 63.14286 52.36580
## 16 2021.288 62.71429 53.55979
## 17 2021.307 55.00000 56.32865
## 18 2021.327 54.00000 59.41406
## 19 2021.346 68.14286 61.25017
## 20 2021.365 62.85714 60.99757
## 21 2021.384 56.14286 59.24536
## 22 2021.403 63.14286 57.57820
## 23 2021.422 60.71429 57.52748
## 24 2021.442 52.71429 59.84829
## 25 2021.461 67.28571 64.22669
## 26 2021.480 55.85714 69.20070
## 27 2021.499 73.85714 72.60041
## 28 2021.518 81.85714 72.82056
## 29 2021.537 61.71429 70.03912
## 30 2021.557 68.85714 66.01692
## 31 2021.576 59.14286 62.75970
## 32 2021.595 63.28571 61.47326
## 33 2021.614 58.14286 62.33043
## 34 2021.633 64.00000 64.64065
## 35 2021.652 71.42857 67.16719
## 36 2021.672 69.42857 68.67033
## 37 2021.691 63.42857 68.55306
## 38 2021.710 62.85714 67.13541
## 39 2021.729 70.14286 65.32834
## 40 2021.748 57.85714 64.06076
## 41 2021.767 52.57143 63.90400
## 42 2021.787 67.00000 64.98187
## 43 2021.806 50.14286 67.02575
## 44 2021.825 61.00000 69.47144
## 45 2021.844 107.14286 71.60396
## 46 2021.863 73.42857 72.79015
## 47 2021.882 98.85714 72.76263
## 48 2021.901 63.42857 71.80027
## 49 2021.921 125.71429 70.64740
## 50 2021.940 82.57143 70.18538
## 51 2021.959 65.00000 71.02913
## 52 2021.978 74.42857 73.19265
## 53 2021.997 58.14286 75.89627
## 54 2022.016 83.71429 77.68823
## 55 2022.036 64.42857 77.12026
## 56 2022.055 64.57143 73.75393
## 57 2022.074 63.71429 68.59693
## 58 2022.093 69.57143 63.44149
## 59 2022.112 64.71429 59.78428
## 60 2022.131 58.85714 58.21952
## 61 2022.151 49.14286 58.38302
## 62 2022.170 55.42857 59.16884
## 63 2022.189 56.14286 59.23610
## 64 2022.208 51.71429 57.82145
## 65 2022.227 57.42857 55.30064
## 66 2022.246 50.57143 52.89422
## 67 2022.266 63.42857 51.83215
## 68 2022.285 44.57143 52.72377
## 69 2022.304 43.85714 55.31528
## 70 2022.323 62.71429 58.49065
## 71 2022.342 61.14286 60.68321
## 72 2022.361 49.28571 60.84876
## 73 2022.381 56.42857 59.31154
## 74 2022.400 58.85714 57.54305
## 75 2022.419 60.14286 57.14814
## 76 2022.438 61.28571 59.05077
## 77 2022.457 56.28571 63.14531
## 78 2022.476 62.00000 68.17263
## 79 2022.496 59.14286 72.02334
## 80 2022.515 76.14286 72.87993
## 81 2022.534 56.14286 70.56173
## 82 2022.553 57.85714 66.62489
## 83 2022.572 53.00000 63.12852
## 84 2022.591 50.28571 61.45866
## 85 2022.611 56.85714 61.96773
## 86 2022.630 57.42857 64.10343
## 87 2022.649 56.85714 66.69659
## 88 2022.668 62.28571 68.45928
## 89 2022.687 53.00000 68.64162
## 90 2022.706 50.85714 67.40666
## 91 2022.726 40.71429 65.60068
## 92 2022.745 56.71429 64.18580
## 93 2022.764 65.14286 63.81107
## 94 2022.783 58.00000 64.67913
## 95 2022.802 54.42857 66.58590
## 96 2022.821 58.85714 69.01204
## 97 2022.841 52.85714 71.25557
## 98 2022.860 50.57143 72.64784
## 99 2022.879 64.42857 72.83669
## 100 2022.898 77.71429 71.99981
## 101 2022.917 60.28571 70.81647
## 102 2022.936 54.28571 70.17340
## 103 2022.956 61.28571 70.76115
## 104 2022.975 99.00000 72.72148
## 105 2022.994 51.42857 75.42154
## 106 2023.013 48.42857 77.49853
## 107 2023.032 51.00000 77.42970
## 108 2023.051 58.00000 74.53271
## 109 2023.071 61.85714 69.57444
## 110 2023.090 60.14286 64.28845
## 111 2023.109 54.71429 60.28911
## 112 2023.128 55.42857 58.34891
## 113 2023.147 47.71429 58.26621
## 114 2023.166 48.14286 59.04205
## 115 2023.186 39.42857 59.32318
## 116 2023.205 48.57143 58.18239
## 117 2023.224 42.42857 55.78515
## 118 2023.243 49.14286 53.25692
## 119 2023.262 49.14286 51.88511
## 120 2023.281 63.85714 52.41753
## 121 2023.300 61.85714 54.76253
## 122 2023.320 70.28571 57.94557
## 123 2023.339 31.71429 60.42250
## 124 2023.358 52.42857 60.97324
## 125 2023.377 63.71429 59.65558
## 126 2023.396 44.57143 57.80028
## 127 2023.415 46.00000 57.06366
## 128 2023.435 38.42857 58.52809
## 129 2023.454 46.85714 62.28755
## 130 2023.473 50.85714 67.28797
## 131 2023.492 62.14286 71.51375
## 132 2023.511 88.85714 72.98177
## 133 2023.530 58.71429 71.15887
## 134 2023.550 60.42857 67.35630
## 135 2023.569 65.14286 63.64954
## 136 2023.588 62.14286 61.59527
## 137 2023.607 54.60000 61.72969
# --- Monthly TBATS model -----------------------------------------------------
vec <- as.vector(train_monthly)
# BUG FIX: this is MONTHLY data, so the seasonal period is 12 observations
# per year; the previous value (52.18) was copy-pasted from the weekly
# section and made the STL seasonal component meaningless.
univariate_ts <- ts(vec, start=1999, frequency=12)
stl_decomposition <- stl(univariate_ts, s.window="periodic")
plot(stl_decomposition)
# Forecast horizon = full length of the held-out monthly test set.
h <- length(test_monthly)
# TBATS is fit on train_monthly directly (which already carries frequency 12).
model_tbats <- tbats(train_monthly)
forecast_tbats <- forecast(model_tbats, h=h)
plot(forecast_tbats)
# Residual diagnostics for the monthly TBATS fit.
checkresiduals(model_tbats)
##
## Ljung-Box test
##
## data: Residuals from TBATS
## Q* = 34.295, df = 24, p-value = 0.07956
##
## Model df: 0. Total lags used: 24
#Ljung-Box test .07956
# Full monthly series with the test set (red) and TBATS forecast (blue) overlaid.
plot(pm2_5_monthly_ts, xlim=c(1999, 2023), ylim=c(min(pm2_5_monthly_ts), max(pm2_5_monthly_ts)), main="Train, Test, and Forecasted Data - Monthly", xlab="Year", ylab="PM2.5 AQI Value")
lines(test_monthly, col="red") # Test data in red
lines(forecast_tbats$mean, col="blue", type="o") # Forecasted data in blue
legend("topleft", legend=c("Train", "Test", "Forecast"), fill=c("black", "red", "blue"))
# Out-of-sample accuracy of the monthly TBATS forecast.
mae <- mean(abs(test_monthly - forecast_tbats$mean))
# FIX: cat() already prints its message and returns NULL invisibly; the old
# print(cat(...)) wrapper additionally printed a spurious "NULL" line
# (visible in the previously recorded output).
cat("Mean Absolute Error (MAE):", round(mae, 2), "\n")
## Mean Absolute Error (MAE): 6.7
#Mean Absolute Error (MAE): 6.61
# Side-by-side table of actual vs TBATS-forecast monthly values over the
# test period (Date is the fractional-year ts time index).
comparison_df <- data.frame(
Date = time(test_monthly),
Actual = as.vector(test_monthly),
Forecasted = as.vector(forecast_tbats$mean)
)
print(comparison_df)
## Date Actual Forecasted
## 1 2021.000 69.25806 70.59917
## 2 2021.083 60.78571 62.21158
## 3 2021.167 49.09677 56.78048
## 4 2021.250 58.90000 56.35278
## 5 2021.333 62.45161 60.29386
## 6 2021.417 59.50000 65.34224
## 7 2021.500 69.90323 67.73135
## 8 2021.583 64.67742 67.04131
## 9 2021.667 64.93333 66.37212
## 10 2021.750 57.61290 68.28974
## 11 2021.833 86.50000 71.88886
## 12 2021.917 80.35484 73.11090
## 13 2022.000 69.90323 69.00588
## 14 2022.083 60.14286 61.88329
## 15 2022.167 53.48387 56.71088
## 16 2022.250 53.73333 56.33676
## 17 2022.333 56.87097 60.28989
## 18 2022.417 60.23333 65.34124
## 19 2022.500 60.32258 67.73111
## 20 2022.583 55.70968 67.04126
## 21 2022.667 52.30000 66.37211
## 22 2022.750 58.70968 68.28973
## 23 2022.833 61.63333 71.88886
## 24 2022.917 66.22581 73.11090
## 25 2023.000 53.80645 69.00588
## 26 2023.083 54.42857 61.88329
## 27 2023.167 44.80645 56.71088
## 28 2023.250 60.83333 56.33676
## 29 2023.333 46.96774 60.28989
## 30 2023.417 48.60000 65.34124
## 31 2023.500 68.41935 67.73111
## 32 2023.583 58.36364 67.04126
# Extended sample ACF for tentative ARMA(p,q) order identification.
# NOTE(review): this relies on TSA's acf() (the drop.lag.0 argument does not
# exist in stats::acf), so the TSA package must be attached and masking
# stats::acf — which it is, per the library() messages at the top of the file.
eacf <-
function (z,ar.max=7,ma.max=13)
{
#
# PROGRAMMED BY K.S. CHAN, DEPARTMENT OF STATISTICS AND ACTUARIAL SCIENCE,
# UNIVERSITY OF IOWA.
#
# DATE: 4/2001
# Compute the extended sample acf (ESACF) for the time series stored in z.
# The matrix of ESACF with the AR order up to ar.max and the MA order
# up to ma.max is stored in the matrix EACFM.
# The default values for NAR and NMA are 7 and 13 respectively.
# Side effect of the eacf function:
# The function prints a coded ESACF table with
# significant values denoted by 'x' and non-significant values by 'o',
# significance level being 5%.
#
# Output:
# eacf=matrix of esacf
# symbol=matrix of coded esacf
#
# lag1: shift a vector forward by `lag` positions, padding the front with NA.
lag1<-function(z,lag=1){c(rep(NA,lag),z[1:(length(z)-lag)])}
# reupm: update the matrix of iterated AR coefficient estimates (one
# recursion step of the ESACF algorithm).
reupm<-function(m1,nrow,ncol){
k<-ncol-1
m2<-NULL
for (i in 1:k){
i1<-i+1
work<-lag1(m1[,i])
work[1]<--1
temp<-m1[,i1]-work*m1[i1,i1]/m1[i,i]
temp[i1]<-0
m2<-cbind(m2,temp)
}
m2}
# ceascf: compute one column of the extended sample ACF from the filtered
# (AR-residual) series.
ceascf<-function(m,cov1,nar,ncol,count,ncov,z,zm){
result<-0*seq(1,nar+1)
result[1]<-cov1[ncov+count]
for (i in 1:nar) {
temp<-cbind(z[-(1:i)],zm[-(1:i),1:i])%*%c(1,-m[1:i,i])
result[i+1]<-acf(temp,plot=FALSE,lag.max=count,drop.lag.0=FALSE)$acf[count+1]
}
result
}
ar.max<-ar.max+1
ma.max<-ma.max+1
nar<-ar.max-1
nma<-ma.max
ncov<-nar+nma+2
nrow<-nar+nma+1
ncol<-nrow-1
# Work with the demeaned series.
z<-z-mean(z)
# zm: matrix of lagged copies of z (lags 1..nar).
zm<-NULL
for(i in 1:nar) zm<-cbind(zm,lag1(z,lag=i))
cov1<-acf(z,lag.max=ncov,plot=FALSE,drop.lag.0=FALSE)$acf
cov1<-c(rev(cov1[-1]),cov1)
ncov<-ncov+1
# Initial AR coefficient estimates by OLS for orders 1..ncol.
m1<-matrix(0,ncol=ncol,nrow=nrow)
for(i in 1:ncol) m1[1:i,i]<-
ar.ols(z,order.max=i,aic=FALSE,demean=FALSE,intercept=FALSE)$ar
eacfm<-NULL
for (i in 1:nma) {
m2<-reupm(m1=m1,nrow=nrow,ncol=ncol)
ncol<-ncol-1
eacfm<-cbind(eacfm, ceascf(m2,cov1,nar,ncol,i,ncov,z,zm))
m1<-m2}
work<-1:(nar+1)
work<-length(z)-work+1
# Code each entry as 'x' (significant at ~5%, |r| > 2/sqrt(n)) or 'o'.
symbol<-NULL
for ( i in 1:nma) {
work<-work-1
symbol<-cbind(symbol,ifelse(abs(eacfm[,i])>2/work^.5, 'x','o'))}
rownames(symbol)<-0:(ar.max-1)
colnames(symbol)<-0:(ma.max-1)
cat('AR/MA\n')
print(symbol,quote=FALSE)
# NOTE(review): "ma.ma" is a typo for "ma.max" in the original code; the
# element name is kept as-is for compatibility with any existing callers.
invisible(list(eacf=eacfm,ar.max=ar.max,ma.ma=ma.max,symbol=symbol))
}
# ESACF table on the (undifferenced) weekly training series; the upper-left
# corner of the 'o' triangle suggests candidate (p, q) orders.
eacf(train_weekly)
## AR/MA
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x x x x x x x x x x x o o o
## 1 x x o o o o o x x o x o o o
## 2 x x o o o o o o x x x o o o
## 3 x x o o o o o o o o o o o o
## 4 x x x o o o o o o o o o o o
## 5 x x x x o o o o o o o o o o
## 6 x x o x o o o o o o o o o o
## 7 x x o x o x o o o o o x o o
# Candidate orders read off the table:
# p <- 1,2,3
# q <- 2,3,4
# Re-confirm stationarity of the seasonally differenced series.
kpss_test_result <- kpss.test(train_weekly_diff)
## Warning in kpss.test(train_weekly_diff): p-value greater than printed p-value
adf_test_result <- adf.test(train_weekly_diff)
## Warning in adf.test(train_weekly_diff): p-value smaller than printed p-value
cat("KPSS: ", round(kpss_test_result$p.value, 2), "\n")
## KPSS: 0.1
cat("ADF: ", round(adf_test_result$p.value, 2), "\n")
## ADF: 0.01
# Fit ARIMA model using auto.arima()
# Find appropriate values for d and D using ndiffs and nsdiffs
# d <- ndiffs(train_weekly_diff)
# D <- nsdiffs(train_weekly_diff)
# Fit ARIMA model using auto.arima() with seasonal components and previously found d and D
# lambda = "auto" lets auto.arima pick a Box-Cox transformation.
# NOTE(review): start.p/start.q only seed the stepwise search; with
# stepwise = FALSE (exhaustive search up to max.p/max.q) they have no
# effect — confirm against forecast::auto.arima documentation.
arima_model_weekly <- auto.arima(train_weekly, seasonal = TRUE, stepwise = FALSE,
approximation = FALSE, trace = FALSE, lambda = "auto",
start.p = 1, max.p = 3,
start.q = 2, max.q = 4)
# auto.arima(pm2_5_weekly_diff, lambda = "auto", seasonal=TRUE, )
# Forecast using ARIMA model
h <- length(test_weekly)
forecast_arima_weekly <- forecast(arima_model_weekly, h = h)
plot(forecast_arima_weekly, main = "PM 2.5 AQI Forecast - sARIMA",
xlab = "Week", ylab = "PM 2.5 AQI")
lines(test_weekly)
# Check the residuals of the ARIMA model
checkresiduals(arima_model_weekly)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,2)
## Q* = 187.38, df = 101.36, p-value = 4.411e-07
##
## Model df: 3. Total lags used: 104.36
# Extracting forecasted values
forecasted_values <- forecast_arima_weekly$mean
# Calculate MAE
mae <- mean(abs(forecasted_values - test_weekly))
# Calculate MAPE
mape <- mean(abs((forecasted_values - test_weekly) / test_weekly)) * 100
# AIC and BIC as reported by the fitted model
aic <- arima_model_weekly$aic
bic <- arima_model_weekly$bic
# BUG FIX: the previous hand-rolled AICc correction used
# n = length(train_weekly_diff) (the seasonally differenced series, not the
# sample the model was actually fit on) and a non-standard formula.
# forecast::Arima model objects already carry the correct small-sample
# corrected value in $aicc, so use it directly.
aic_corrected <- arima_model_weekly$aicc
# Print the results
cat("MAE: ", round(mae, 2), "\n")
## MAE: 9.46
cat("MAPE: ", round(mape, 2), "%\n")
## MAPE: 16.5 %
cat("AICc: ", round(aic_corrected, 2), "\n")
cat("BIC: ", round(bic, 2), "\n")
## BIC: -6014.3
# Plot historical data, actual test, and forecast on the ORIGINAL scale.
# BUG FIX (two issues in the previous version):
#  * seq(from = end(test_weekly) + 1/52, ...) is invalid: end() on a ts
#    returns a length-2 c(year, period) vector, not a scalar `from`; there
#    is no need for a hand-built index because forecast_arima_weekly$mean
#    is itself a ts aligned with the test period.
#  * the "historical" line plotted train_weekly_diff (the seasonally
#    differenced series) against undifferenced test/forecast values,
#    mixing two different scales on one axis.
plot(train_weekly, type = "l", col = "black",
     xlim = c(1999, 2024),
     xlab = "Week", ylab = "PM 2.5 AQI",
     main = "PM 2.5 AQI Forecast - sARIMA")
lines(test_weekly, col = "blue")
# The forecast ts carries its own (correct) time base.
lines(forecasted_values, col = "red")
# Adding legend
legend("topright", legend = c("Historical Data", "Actual Test", "Forecast"),
       col = c("black", "blue", "red"), lty = 1)
# Side-by-side table of actual vs sARIMA-forecast weekly values.
# NOTE(review): the recorded output below shows the forecast converging to a
# constant (~63.39) — consistent with the non-seasonal ARIMA(1,1,2) that
# auto.arima selected; a seasonal model may be worth forcing.
comparison_df <- data.frame(
Date = time(test_weekly),
Actual = as.vector(test_weekly),
Forecasted = as.vector(forecasted_values)
)
print(comparison_df)
## Date Actual Forecasted
## 1 2021.001 91.57143 70.55457
## 2 2021.020 83.28571 68.49195
## 3 2021.039 77.28571 67.04783
## 4 2021.058 52.57143 66.02520
## 5 2021.077 46.42857 65.29516
## 6 2021.097 65.57143 64.77095
## 7 2021.116 64.57143 64.39297
## 8 2021.135 54.57143 64.11960
## 9 2021.154 58.42857 63.92145
## 10 2021.173 52.14286 63.77760
## 11 2021.192 43.85714 63.67305
## 12 2021.212 45.14286 63.59699
## 13 2021.231 50.85714 63.54164
## 14 2021.250 60.42857 63.50133
## 15 2021.269 63.14286 63.47196
## 16 2021.288 62.71429 63.45057
## 17 2021.307 55.00000 63.43498
## 18 2021.327 54.00000 63.42361
## 19 2021.346 68.14286 63.41533
## 20 2021.365 62.85714 63.40929
## 21 2021.384 56.14286 63.40489
## 22 2021.403 63.14286 63.40168
## 23 2021.422 60.71429 63.39934
## 24 2021.442 52.71429 63.39764
## 25 2021.461 67.28571 63.39640
## 26 2021.480 55.85714 63.39549
## 27 2021.499 73.85714 63.39483
## 28 2021.518 81.85714 63.39435
## 29 2021.537 61.71429 63.39400
## 30 2021.557 68.85714 63.39374
## 31 2021.576 59.14286 63.39355
## 32 2021.595 63.28571 63.39342
## 33 2021.614 58.14286 63.39332
## 34 2021.633 64.00000 63.39325
## 35 2021.652 71.42857 63.39319
## 36 2021.672 69.42857 63.39315
## 37 2021.691 63.42857 63.39313
## 38 2021.710 62.85714 63.39311
## 39 2021.729 70.14286 63.39309
## 40 2021.748 57.85714 63.39308
## 41 2021.767 52.57143 63.39307
## 42 2021.787 67.00000 63.39307
## 43 2021.806 50.14286 63.39306
## 44 2021.825 61.00000 63.39306
## 45 2021.844 107.14286 63.39306
## 46 2021.863 73.42857 63.39306
## 47 2021.882 98.85714 63.39305
## 48 2021.901 63.42857 63.39305
## 49 2021.921 125.71429 63.39305
## 50 2021.940 82.57143 63.39305
## 51 2021.959 65.00000 63.39305
## 52 2021.978 74.42857 63.39305
## 53 2021.997 58.14286 63.39305
## 54 2022.016 83.71429 63.39305
## 55 2022.036 64.42857 63.39305
## 56 2022.055 64.57143 63.39305
## 57 2022.074 63.71429 63.39305
## 58 2022.093 69.57143 63.39305
## 59 2022.112 64.71429 63.39305
## 60 2022.131 58.85714 63.39305
## 61 2022.151 49.14286 63.39305
## 62 2022.170 55.42857 63.39305
## 63 2022.189 56.14286 63.39305
## 64 2022.208 51.71429 63.39305
## 65 2022.227 57.42857 63.39305
## 66 2022.246 50.57143 63.39305
## 67 2022.266 63.42857 63.39305
## 68 2022.285 44.57143 63.39305
## 69 2022.304 43.85714 63.39305
## 70 2022.323 62.71429 63.39305
## 71 2022.342 61.14286 63.39305
## 72 2022.361 49.28571 63.39305
## 73 2022.381 56.42857 63.39305
## 74 2022.400 58.85714 63.39305
## 75 2022.419 60.14286 63.39305
## 76 2022.438 61.28571 63.39305
## 77 2022.457 56.28571 63.39305
## 78 2022.476 62.00000 63.39305
## 79 2022.496 59.14286 63.39305
## 80 2022.515 76.14286 63.39305
## 81 2022.534 56.14286 63.39305
## 82 2022.553 57.85714 63.39305
## 83 2022.572 53.00000 63.39305
## 84 2022.591 50.28571 63.39305
## 85 2022.611 56.85714 63.39305
## 86 2022.630 57.42857 63.39305
## 87 2022.649 56.85714 63.39305
## 88 2022.668 62.28571 63.39305
## 89 2022.687 53.00000 63.39305
## 90 2022.706 50.85714 63.39305
## 91 2022.726 40.71429 63.39305
## 92 2022.745 56.71429 63.39305
## 93 2022.764 65.14286 63.39305
## 94 2022.783 58.00000 63.39305
## 95 2022.802 54.42857 63.39305
## 96 2022.821 58.85714 63.39305
## 97 2022.841 52.85714 63.39305
## 98 2022.860 50.57143 63.39305
## 99 2022.879 64.42857 63.39305
## 100 2022.898 77.71429 63.39305
## 101 2022.917 60.28571 63.39305
## 102 2022.936 54.28571 63.39305
## 103 2022.956 61.28571 63.39305
## 104 2022.975 99.00000 63.39305
## 105 2022.994 51.42857 63.39305
## 106 2023.013 48.42857 63.39305
## 107 2023.032 51.00000 63.39305
## 108 2023.051 58.00000 63.39305
## 109 2023.071 61.85714 63.39305
## 110 2023.090 60.14286 63.39305
## 111 2023.109 54.71429 63.39305
## 112 2023.128 55.42857 63.39305
## 113 2023.147 47.71429 63.39305
## 114 2023.166 48.14286 63.39305
## 115 2023.186 39.42857 63.39305
## 116 2023.205 48.57143 63.39305
## 117 2023.224 42.42857 63.39305
## 118 2023.243 49.14286 63.39305
## 119 2023.262 49.14286 63.39305
## 120 2023.281 63.85714 63.39305
## 121 2023.300 61.85714 63.39305
## 122 2023.320 70.28571 63.39305
## 123 2023.339 31.71429 63.39305
## 124 2023.358 52.42857 63.39305
## 125 2023.377 63.71429 63.39305
## 126 2023.396 44.57143 63.39305
## 127 2023.415 46.00000 63.39305
## 128 2023.435 38.42857 63.39305
## 129 2023.454 46.85714 63.39305
## 130 2023.473 50.85714 63.39305
## 131 2023.492 62.14286 63.39305
## 132 2023.511 88.85714 63.39305
## 133 2023.530 58.71429 63.39305
## 134 2023.550 60.42857 63.39305
## 135 2023.569 65.14286 63.39305
## 136 2023.588 62.14286 63.39305
## 137 2023.607 54.60000 63.39305
# Full model report: ARIMA order, Box-Cox lambda used by the fit, coefficient
# estimates with standard errors, information criteria, and in-sample accuracy.
summary(arima_model_weekly)
## Series: train_weekly
## ARIMA(1,1,2)
## Box Cox transformation: lambda= -0.5880568
##
## Coefficients:
## ar1 ma1 ma2
## 0.7291 -1.3756 0.3819
## s.e. 0.0568 0.0791 0.0776
##
## sigma^2 = 0.0003005: log likelihood = 3021.24
## AIC=-6034.48 AICc=-6034.44 BIC=-6014.3
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 1.61547 15.94283 11.5259 -1.993959 16.06187 0.7131783 0.02621965
# Shapiro-Wilk test of the weekly ARIMA residuals (H0: residuals are normal)
shapiro_test_result <- shapiro.test(arima_model_weekly$residuals)
# Show the full test object
print(shapiro_test_result)
##
## Shapiro-Wilk normality test
##
## data: arima_model_weekly$residuals
## W = 0.99235, p-value = 1.168e-05
# Report the verdict at the conventional 5% significance level
normality_rejected <- shapiro_test_result$p.value < 0.05
if (normality_rejected) {
  cat("The residuals are not normally distributed (p-value:", shapiro_test_result$p.value, ")\n")
} else {
  cat("The residuals appear to be normally distributed (p-value:", shapiro_test_result$p.value, ")\n")
}
## The residuals are not normally distributed (p-value: 1.167989e-05 )
# --- Monthly series: Box-Cox transformation of the training data ----
# Convert monthly data to a time series object
# pm2_5_monthly_ts <- ts(pm2_5_monthly["19990131/20230811"], start = c(1999, 1), frequency = 12)
y_values_monthly <- train_monthly
# Estimate the Box-Cox lambda that best stabilises the variance
optimal_lambda <- BoxCox.lambda(y_values_monthly)
# Apply the Box-Cox transformation; lambda == 0 corresponds to the log transform
transformed_data_monthly <- if (optimal_lambda != 0) {
  (y_values_monthly^optimal_lambda - 1) / optimal_lambda
} else {
  log(y_values_monthly)
}
# NOTE(review): a kpss/adf stationarity check on pm2_5_monthly_ts that stood
# here was removed as dead code — its kpss_test_result, adf_test_result and
# the pm2_5_monthly_diff it assigned were all unconditionally overwritten by
# the checks on transformed_data_monthly that follow, so it had no effect.
# Check stationarity of the transformed monthly series; difference as needed.
# The two tests have OPPOSITE null hypotheses:
#   kpss.test: H0 = stationary -> small p-value is evidence of NON-stationarity
#   adf.test:  H0 = unit root  -> small p-value is evidence of stationarity
# The original condition used `adf < 0.05` as a trigger to difference, which
# inverted the ADF conclusion and caused differencing of stationary series
# (the knitted warnings showed both tests indicating stationarity after one
# diff, yet a second diff was applied). Corrected: difference when KPSS
# rejects stationarity OR ADF fails to reject the unit root.
kpss_test_result <- kpss.test(transformed_data_monthly)
adf_test_result <- adf.test(transformed_data_monthly)
if (kpss_test_result$p.value < 0.05 || adf_test_result$p.value > 0.05) {
  pm2_5_monthly_diff <- diff(transformed_data_monthly)
  # Re-test stationarity after first differencing
  kpss_test_result_diff <- kpss.test(pm2_5_monthly_diff)
  adf_test_result_diff <- adf.test(pm2_5_monthly_diff)
  if (kpss_test_result_diff$p.value < 0.05 || adf_test_result_diff$p.value > 0.05) {
    # Still non-stationary: apply second-order differencing
    pm2_5_monthly_diff <- diff(pm2_5_monthly_diff)
    print("First and Second Order Differencing was performed")
  } else {
    print("First Order Differencing was performed")
  }
} else {
  # Already stationary: keep the transformed series unchanged.
  # (The original printed "First Order Differencing was performed" on this
  # branch even though no differencing happened — message corrected.)
  pm2_5_monthly_diff <- transformed_data_monthly
  print("No Differencing was performed")
}
# Fit a (seasonal) ARIMA to the differenced, Box-Cox-transformed monthly data
arima_model_monthly <- auto.arima(pm2_5_monthly_diff, seasonal = TRUE)
# Forecast the same number of periods as the monthly test set covers
h <- length(test_monthly)
forecast_arima_monthly <- forecast(arima_model_monthly, h = h)
# FIXME(review): the model was fitted on DIFFERENCED data, so these point
# forecasts are on the differenced Box-Cox scale; InvBoxCox alone does not
# undo the differencing. The forecasts should be cumulatively summed back
# from the last observed transformed value before inverting the transform.
original_scale_forecasts_monthly <- InvBoxCox(forecast_arima_monthly$mean, lambda = optimal_lambda)
# cat() writes to the console and returns NULL invisibly, so it must not be
# wrapped in print() — the original `print(cat(...))` emitted a stray "NULL"
cat("Optimal Lambda Value:", optimal_lambda, "\n")
## Optimal Lambda Value: -0.09958836
# FIXME(review): multiplying by 100 is an ad-hoc rescaling that papers over
# the missing inverse differencing above; remove once that is fixed
original_scale_forecasts_monthly <- original_scale_forecasts_monthly * 100
# Plot the original data with the forecast overlaid
plot(train_monthly, main = "Original Data and Forecasts")
lines(original_scale_forecasts_monthly, col = "blue") # Add forecast line in blue
# Add legend
legend("topleft", legend = c("Original Data", "Forecasts"), col = c("black", "blue"), lwd = 1)
# Residual diagnostics; a small Ljung-Box p-value means autocorrelation remains
checkresiduals(arima_model_monthly)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,0,2)(0,0,2)[12] with non-zero mean
## Q* = 35.946, df = 17, p-value = 0.004663
##
## Model df: 7. Total lags used: 24
#Ljung-Box test .05179
# Plot the (differenced) training series against the test data and forecasts.
# NOTE(review): pm2_5_monthly_diff is on the differenced/transformed scale
# while test_monthly and the forecasts are (approximately) raw AQI values, so
# the three series do not share a comparable y-axis — confirm intent.
plot(pm2_5_monthly_diff, xlim=c(1999, 2023), ylim=c(min(pm2_5_monthly_diff), max(pm2_5_monthly_diff)), main="Train, Test, and Forecasted Data - Monthly", xlab="Year", ylab="PM2.5 AQI Value")
lines(test_monthly, col="red") # Test data in red
lines(original_scale_forecasts_monthly, col="blue", type="o") # Forecasted data in blue
legend("topleft", legend=c("Train", "Test", "Forecast"), fill=c("black", "red", "blue"))
# Mean absolute error of the monthly forecasts over the test window
mae <- mean(abs(test_monthly - original_scale_forecasts_monthly))
# cat() returns NULL invisibly; the original print(cat(...)) printed a stray "NULL"
cat("Mean Absolute Error (MAE):", round(mae, 2), "\n")
## Mean Absolute Error (MAE): 39.61
#Mean Absolute Error (MAE): 9.31
# Side-by-side table of observed vs. predicted monthly PM2.5 AQI over the
# test window, indexed by the (fractional-year) time of each observation.
monthly_actual <- as.vector(test_monthly)
monthly_predicted <- as.vector(original_scale_forecasts_monthly)
comparison_df <- data.frame(
  Date = time(test_monthly),
  Actual = monthly_actual,
  Forecasted = monthly_predicted
)
print(comparison_df)
## Date Actual Forecasted
## 1 2021.000 69.25806 88.03185
## 2 2021.083 60.78571 99.66849
## 3 2021.167 49.09677 100.36669
## 4 2021.250 58.90000 110.35343
## 5 2021.333 62.45161 99.55594
## 6 2021.417 59.50000 97.38366
## 7 2021.500 69.90323 105.60004
## 8 2021.583 64.67742 95.63940
## 9 2021.667 64.93333 104.14763
## 10 2021.750 57.61290 94.36865
## 11 2021.833 86.50000 99.47679
## 12 2021.917 80.35484 101.88149
## 13 2022.000 69.90323 99.58545
## 14 2022.083 60.14286 98.46090
## 15 2022.167 53.48387 97.13609
## 16 2022.250 53.73333 106.51176
## 17 2022.333 56.87097 100.76623
## 18 2022.417 60.23333 96.62587
## 19 2022.500 60.32258 103.50100
## 20 2022.583 55.70968 99.18585
## 21 2022.667 52.30000 101.98734
## 22 2022.750 58.70968 94.54031
## 23 2022.833 61.63333 100.31511
## 24 2022.917 66.22581 103.37849
## 25 2023.000 53.80645 97.26432
## 26 2023.083 54.42857 101.04354
## 27 2023.167 44.80645 100.17232
## 28 2023.250 60.83333 100.08646
## 29 2023.333 46.96774 100.05663
## 30 2023.417 48.60000 100.02717
## 31 2023.500 68.41935 100.01440
## 32 2023.583 58.36364 100.00776